How complete is the CDC's COVID-19 case surveillance and death certificate data for deaths by race/ethnicity at the state and county levels?

February 9, 2021

In [ ]:
#@title
import pandas as pd
import altair as alt
from vega_datasets import data

from google.colab import auth
auth.authenticate_user()

# Turn off the three-dot menu for Altair/Vega charts.
alt.renderers.set_embed_options(actions=False)
#%load_ext google.colab.data_table
In [ ]:
#@title
# How to update the data:
# 0. You may need to copy this colab so you have your own version.
# 1. Update the cdc_table to have the latest data's suffix.
# 2. Update the date variables below to be the last case date included in the data.
#    If the CRDT doesn't have data on that exact date, choose the closest date for crdt_date.
# 3. If the last case date is after Feb 2, 2021, you'll need to upload a new version
#    of the crdt data to compare against and change the crdt_table name below.
# 4. Scatterplot max/min below in chart settings may need to be updated for more cases.
# 5. There are a few checks for the county_fips_mapping that we created due to issues with the CDC's.
#    Instructions are at https://docs.google.com/spreadsheets/d/1AVSSge7BpkbNL4PfumUZpL7hokMLjKUojtamQjNW6f0/edit?resourcekey=0-Abdprx3fy_pXikSCDV2hxw#gid=967935006.
# 6. Many/all of the tables and text are not auto-updated. If you want to do a full updated of
#    the paper including text and tables, a lot of that is done in commented out PrintSummaryStats().

project_id = 'msm-secure-data-1b'
cdc_table = '`%s.ndunlap_secure.cdc_restricted_access_20210131`' % project_id
crdt_table = '`%s.ndunlap_secure.crdt_20210203`' % project_id
date = 'DATE(2021, 01, 16)'
crdt_date = '20210117'
date_display_name = 'Jan 16'

# Provisional deaths data dates.
provisional_date = 'DATE(2021, 2, 3)'
crdt_provisional_date = 20210203
provisional_date_display_name = 'Feb 3'
provisional_deaths_state_table = '`msm-secure-data-1b.ndunlap_secure.cdc_provisional_deaths_state_20210203`'
provisional_deaths_county_table = '`msm-secure-data-1b.ndunlap_secure.cdc_provisional_deaths_county_20210203`'

# Set the scatterplot max/min to better handle outliers (CA, Los Angeles).
total_cases_scale_max = 40000
county_cases_scale_max = 16000
county_cases_zoom_scale_max = 3000
cases_known_scale_max = 40000 # known race/ethnicity

# Chart settings.
scatter_height = 300
scatter_width = 300
map_height = 300
map_width = 450
us_states = alt.topo_feature(data.us_10m.url, 'states')
us_counties = alt.topo_feature(data.us_10m.url+"#", 'counties')

territories = ('PR', 'GU', 'VI', 'MP', 'AS')
nyt_territories = ('Puerto Rico', 'Guam', 'Virgin Islands', 'Northern Mariana Islands', 'American Samoa')
states_to_fips = {'AL': 1, 'AK': 2, 'AZ': 4, 'AR': 5, 'AS': 3, 'CA': 6, 'CO': 8, 'CT': 9, 'DC': 11, 'DE': 10, 'FL': 12, 'GA': 13, 'GU': 14, 'HI': 15, 'ID': 16, 'IL': 17, 'IN': 18, 'IA': 19, 'KS': 20, 'KY': 21, 'LA': 22, 'ME': 23, 'MD': 24, 'MA': 25, 'MI': 26, 'MN': 27, 'MS': 28, 'MO': 29, 'MT': 30, 'NE': 31, 'NV': 32, 'NH': 33, 'NJ': 34, 'NM': 35, 'NY': 36, 'NYC': 36, 'NC': 37, 'ND': 38, 'OH': 39, 'OK': 40, 'OR': 41, 'PA': 42, 'PR': 43, 'RI': 44, 'SC': 45, 'SD': 46, 'TN': 47, 'TX': 48, 'UT': 49, 'VT': 50, 'VA': 51, 'VI': 52, 'WA': 53, 'WV': 54, 'WI': 55, 'WY': 56, 'AS': 60, 'GU': 66, 'MP': 69, 'PR': 72, 'VI': 78}
states_to_abbreviations = {'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR', 'California': 'CA', 'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE', 'District of Columbia': 'DC', 'Florida': 'FL', 'Georgia': 'GA', 'Hawaii': 'HI', 'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA', 'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD', 'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS', 'Missouri': 'MO', 'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ', 'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC', 'North Dakota': 'ND', 'Ohio': 'OH', 'Oklahoma': 'OK', 'Oregon': 'OR', 'Pennsylvania': 'PA', 'Rhode Island': 'RI', 'South Carolina': 'SC', 'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT', 'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY'}
race_ethnicity_combined_map = {
    'Asian, Non-Hispanic': 'asian_cases',
    'Black, Non-Hispanic': 'black_cases',
    'White, Non-Hispanic': 'white_cases',
    'American Indian/Alaska Native, Non-Hispanic': 'aian_cases',
    'Hispanic/Latino': 'hispanic_cases',
    'Multiple/Other, Non-Hispanic': 'other_cases',
    'Native Hawaiian/Other Pacific Islander, Non-Hispanic': 'nhpi_cases',
    'Missing': 'unknown_cases',
    'Unknown': 'unknown_cases',
    'NA': 'na_cases',
}
In [ ]:
#@title
crdt_query = ('''
SELECT
  State as state,
  Deaths_Total as crdt_cases,
  Deaths_Total - Deaths_Unknown as crdt_known_race_cases,
  ROUND(1 - Deaths_Unknown / Deaths_Total, 4) as crdt_known_race_cases_percent,
FROM %s
WHERE
  date = %s
''' % (crdt_table, crdt_date))

nyt_states_query = ('''
SELECT
  state_name,
  state_fips_code,
  deaths as nyt_cases,
  deaths as nyt_deaths
FROM `bigquery-public-data.covid19_nyt.us_states`
WHERE
  date = %s AND
  state_fips_code IS NOT NULL
''' % date)

nyt_counties_query = ('''
SELECT
  county_fips_code,
  deaths as nyt_cases,
FROM `bigquery-public-data.covid19_nyt.us_counties`
WHERE
  date = %s AND
  county_fips_code IS NOT NULL
''' % date)

cdc_states_query = ('''
SELECT
  res_state,
  COUNT(*) as cdc_cases
FROM
  %s
WHERE
  death_yn = 'Yes'
GROUP BY
   res_state
''' % cdc_table)

cdc_counties_query = ('''
SELECT
  res_state,
  res_county,
  race_ethnicity_combined,
  COUNT(*) as cases
FROM
  %s
WHERE
  death_yn = 'Yes'
GROUP BY
   res_county,
   res_state,
   race_ethnicity_combined
''' % cdc_table)

compare_cases_unknowns_query = ('''
SELECT
  res_state,
  race_ethnicity_combined,
  COUNT(*) as cdc_cases
FROM
  %s
WHERE
  death_yn = 'Yes'
GROUP BY
   res_state,
   race_ethnicity_combined
''' % cdc_table)

cdc_states_by_month_query = ('''
SELECT
  res_state,
  CONCAT(EXTRACT(YEAR from cdc_case_earliest_dt), '-Q', EXTRACT(QUARTER from cdc_case_earliest_dt)) as date,
  COUNT(*) as total_cases,
FROM
  %s
WHERE
  death_yn = 'Yes' AND
  cdc_case_earliest_dt >= DATE(2020, 1, 1) AND
  cdc_case_earliest_dt < DATE(2021, 1, 1) AND
  res_state in ('AK', 'CA', 'CT', 'DE', 'GA', 'LA', 'MD', 'ND', 'NY', 'PA', 'RI')
GROUP BY
   1, 2
ORDER BY
   1, 2
''' % cdc_table)

cdc_states_by_month_known_or_na_query = ('''
SELECT
  res_state,
  CONCAT(EXTRACT(YEAR from cdc_case_earliest_dt), '-Q', EXTRACT(QUARTER from cdc_case_earliest_dt)) as date,
  COUNT(*) as known_or_na_cases,
FROM
  %s
WHERE
  death_yn = 'Yes' AND
  cdc_case_earliest_dt >= DATE(2020, 1, 1) AND
  cdc_case_earliest_dt < DATE(2021, 1, 1) AND
  race_ethnicity_combined != 'Unknown' AND
  race_ethnicity_combined != 'Missing'
GROUP BY
   1, 2
ORDER BY
   1, 2
''' % cdc_table)

cdc_overall_query = ('''
SELECT
  race_ethnicity_combined,
  COUNT(*) as cases
FROM
  %s
WHERE
  death_yn = 'Yes'
GROUP BY
   1
''' % cdc_table)

cdc_provisional_deaths_state_overall_query = ('''
SELECT
  *
FROM
  `msm-secure-data-1b.ndunlap_secure.cdc_provisional_deaths_state_20210203`
WHERE
  Indicator = "Distribution of COVID-19 deaths (%)" AND
  State = 'United States'
''')

cdc_provisional_deaths_state_counts_overall_query = ('''
SELECT
  *
FROM
  `msm-secure-data-1b.ndunlap_secure.cdc_provisional_deaths_state_20210203`
WHERE
  Indicator = "Count of COVID-19 deaths" AND
  State = 'United States'
''')

county_fips_mapping_query = ('''
SELECT
*
FROM
  `msm-secure-data-1b.ndunlap_secure.county_fips_mapping`
''')

acs_population_data_query = ('''
SELECT
  state,
  county,
  county_fips,
  total_pop
FROM
  `msm-internal-data.ipums_acs.acs_2019_5year_county`
''')
In [ ]:
#@title
# Provisional deaths data queries.
cdc_provisional_deaths_county_query = ('''
SELECT
  *
FROM %s
WHERE
  Indicator = "Distribution of COVID-19 deaths (%%)"
''' % provisional_deaths_county_table)

cdc_provisional_deaths_state_query = ('''
SELECT
  *
FROM %s
WHERE
  Indicator = "Distribution of COVID-19 deaths (%%)" AND
  State != 'United States'
''' % provisional_deaths_state_table)

cdc_provisional_deaths_state_counts_query = ('''
SELECT
  *
FROM %s
WHERE
  Indicator = "Count of COVID-19 deaths" AND
  State != 'United States'
''' % provisional_deaths_state_table)

crdt_provisional_query = ('''
SELECT
  State as state,
  Deaths_Total as crdt_cases,
  Deaths_Total - Deaths_Unknown as crdt_known_race_cases,
  ROUND(1 - Deaths_Unknown / Deaths_Total, 4) as crdt_known_race_cases_percent,
FROM %s
WHERE
  date = %s
''' % (crdt_table, crdt_provisional_date))

nyt_counties_provisional_query = ('''
SELECT
  county_fips_code,
  deaths as nyt_cases,
FROM `bigquery-public-data.covid19_nyt.us_counties`
WHERE
  date = %s  AND
  county_fips_code IS NOT NULL
''' % provisional_date)
In [ ]:
#@title
def FieldAnalysis(project_id, table, field_list):
  dict = {}
  for field in field_list:
      dict[field] = [0.0, 0.0, 0.0, 0.0]
  unknowns = pd.DataFrame(dict, index=['Unknown', 'Missing', 'NA', 'Known'])
  field_series = []
  value_series = []
  percent_series = []

  for field in field_list:
    field_unknowns_query = ('''
    SELECT
      %s,
      count(*) as cases
    FROM
      %s
    WHERE
      death_yn = 'Yes'
    GROUP BY
      %s
    ''')
    query = field_unknowns_query % (field, table, field)
    field_unknowns_df = pd.io.gbq.read_gbq(query, project_id=project_id)
    field_unknowns_df.set_index(field, inplace=True)
    field_unknowns_df.index = field_unknowns_df.index.fillna('Null')

    field_display_name = {
        'cdc_case_earliest_dt': 'CDC earliest case date',
        'current_status': 'Case status',
        'res_state': 'State',
        'res_county': 'County',
        'sex': 'Sex',
        'age_group': 'Age',
        'race_ethnicity_combined': 'Race/Ethnicity'}

    missing_count = 0
    if 'Missing' in field_unknowns_df.index:
      missing_count += field_unknowns_df.loc['Missing'].cases
    if 'Null' in field_unknowns_df.index:
      missing_count += field_unknowns_df.loc['Null'].cases
    if '' in field_unknowns_df.index:
      missing_count += field_unknowns_df.loc[''].cases
    #if field_unknowns_df.index.isnull().any():
    #  missing_count += field_unknowns_df.loc[field_unknowns_df.index.isnull()].cases
    unknowns.loc['Missing', field] = missing_count / field_unknowns_df.cases.sum()

    if 'Unknown' in field_unknowns_df.index:
      unknowns.loc['Unknown', field] = field_unknowns_df.loc['Unknown'].cases / field_unknowns_df.cases.sum()
    if 'NA' in field_unknowns_df.index:
      unknowns.loc['NA', field] = field_unknowns_df.loc['NA'].cases / field_unknowns_df.cases.sum()
    unknowns.loc['Known', field] = 1 - (unknowns.loc['Missing', field] +
                                        unknowns.loc['Unknown', field] +
                                        unknowns.loc['NA', field])
    field_series.extend([field_display_name.get(field, field)] * 4)
    value_series.extend(['Known', 'Supressed', 'Unknown', 'Missing'])
    percent_series.extend([unknowns.loc['Known', field],
                           unknowns.loc['NA', field],
                           unknowns.loc['Unknown', field],
                           unknowns.loc['Missing', field]])
  test = pd.DataFrame.from_dict({'field': field_series,
                               'value': value_series,
                               'percent': percent_series})
  return alt.Chart(test).mark_bar().encode(
      x=alt.X('percent', axis=alt.Axis(format='%'), title=''),
      y=alt.Y('field', sort='x', title='Field'),
      color=alt.Color('value', scale=alt.Scale(scheme='category20'), title='Value'),
      order=alt.Order('field:N'),
      tooltip=[
                  alt.Tooltip('field:N', title='Field'),
                  alt.Tooltip('value:N', title='Value'),
                  alt.Tooltip('percent:Q', format=',.0%', title='Percent'),
      ]
  )

Background

This analysis picks up where the case data completeness analysis left off to evaluate COVID-19 deaths data with race/ethnicity at the state and county levels.

While we only have two options for COVID-19 case data with race/ethnicity, we have four options for COVID-19 deaths data with race/ethnicity:

  1. Covid Racial Data Tracker: States, updated twice a week until March 7, 2021
  2. CDC Case Surveillance Restricted Access data: States and counties, updated monthly
  3. CDC Provisional deaths data from death certificates: States and counties, updated weekly
  4. American Public Media Research Lab: States, updated monthly

In the case data completeness analysis, we compared the CDC Case Surveillance dataset to the Covid Racial Data Tracker (CRDT). In this analysis, we'll compare the CDC Case Surveillance dataset to the CRDT and the CDC Provisional deaths datasets. We will not analyze the American Public Media Research Lab dataset because they use a mixture of data from state public health websites for most states, as the CRDT does, and CDC Provisional Deaths data in 14 states where the data are unavailable or less complete than on public health websites.

There are completeness issues with using the CDC case dataset for deaths:

  • Only 80% of total deaths up to January 16 are included (78K deaths are missing)
  • Of those deaths, 78% have known race/ethnicity (70K out of 309K deaths are missing race/ethnicity)

For the 239K deaths where we do know race/ethnicity, we can see the following disparities across race/ethnicity groups:

In [ ]:
#@title
overall_df = pd.io.gbq.read_gbq(cdc_overall_query, project_id=project_id)
overall_df['race_ethnicity_combined'] = overall_df.race_ethnicity_combined.astype('string').str.strip()
overall_df = overall_df.replace(to_replace={'race_ethnicity_combined': race_ethnicity_combined_map})
overall_df = overall_df.set_index('race_ethnicity_combined')

chart_denominator = 1000
cases_list = [overall_df.cases['hispanic_cases'] / chart_denominator,
         overall_df.cases['black_cases'] / chart_denominator,
         overall_df.cases['white_cases'] / chart_denominator,
         overall_df.cases['asian_cases'] / chart_denominator,
         overall_df.cases['nhpi_cases'] / chart_denominator,
         overall_df.cases['aian_cases'] / chart_denominator,
         overall_df.cases.sum() / chart_denominator,
]

# Population data from https://api.census.gov/data/2019/acs/acs1/profile?get=NAME,DP05_0071E,DP05_0078E,DP05_0077E,DP05_0080E,DP05_0081E,DP05_0079E,DP05_0070E&for=us:1
pop_list = [
    60481746 / chart_denominator,
    40596040  / chart_denominator,
    196789401 / chart_denominator,
    18427914  / chart_denominator,
    565473 / chart_denominator,
    2236348 / chart_denominator,
    328239523 / chart_denominator,
]
percent_list = []
for i in range(len(cases_list)):
  percent_list.append(cases_list[i] / pop_list[i])
prevalence = pd.DataFrame.from_dict({'group': [
    'Hispanic/Latino',
    'Black',
    'White',
    'Asian',
    'Native Hawaiian/Pacific Islander',
    'American Indian/Alaska Native',
    '*Total Including Unknowns*',
], 'percent': percent_list,
   'cases': cases_list,
   'population': pop_list,
})

bars = alt.Chart(prevalence).mark_bar().encode(
      x=alt.X('percent', axis=alt.Axis(format='.2%'), scale=alt.Scale(domain=(0, .0022)), title=''),
      y=alt.Y('group', sort='-x', title=''),
      color=alt.Color('group', 
                      scale=alt.Scale(scheme='tableau20'),
                      title='',
                      legend=None),
      tooltip=[
                  alt.Tooltip('group:N', title='Race/Ethnicity Group'),
                  alt.Tooltip('percent:Q', format='.3%', title='Percent who died'),
                  alt.Tooltip('cases:Q', format=',.2f', title='Deaths in group (thousands)'),
                  alt.Tooltip('population:Q', format=',.0f', title='Population of group (thousands)'),
      ]
).properties(
   title='Percent of Race/Ethnicity Group who died from COVID-19 based on Incomplete CDC Case Data as of %s' % date_display_name)

bars.display()

#alt.concat(bars).properties(
#    title=alt.TitleParams(
#        ['Source: U.S. Census Bureau\'s American Community Survey 2019 5-year estimates for population data.'],
#        baseline='bottom',
#        dy=20,
#        orient='bottom',
#        fontWeight='normal',
#        fontSize=11
#    )
#).display()

But the chart above is based on incomplete data. For example, the CDC dataset says that 0% of deaths in California were Hispanic/Latino people, whereas the California public health website reports that Hispanics/Latinos made up 46% of deaths (17K people) as of January 27.

If all 70K deaths with missing race/ethnicity were Hispanic/Latino people, the percent of Hispanic/Latinos in the U.S. who died from COVID-19 would go from 0.05% to 0.17% — a 3x increase. If all 70K deaths with missing race/ethnicity were Black people, the percent of Black people who died from COVID-19 would go from 0.09% to 0.26% — also a 3x increase. While these extreme scenarios are unlikely, they show us why missing race/ethnicity data is preventing us from truly understanding and addressing the disparities in the COVID-19 pandemic in the U.S.

We can get a more complete view of the deaths within each race/ethnicity group from the CDC Provisional Deaths dataset.

In [ ]:
#@title

overall_provisional_df = pd.io.gbq.read_gbq(cdc_provisional_deaths_state_overall_query, project_id=project_id)
overall_provisional_df
overall_provisional_counts_df = pd.io.gbq.read_gbq(cdc_provisional_deaths_state_counts_overall_query, project_id=project_id)
overall_provisional_counts_df.head(15)

overall_provisional_df['cdc_known_cases_percent'] = round((
    overall_provisional_df.Non_Hispanic_White.fillna(0) +
    overall_provisional_df.Non_Hispanic_Black_or_African_American.fillna(0) +
    overall_provisional_df.Non_Hispanic_American_Indian_or_Alaska_Native.fillna(0) +
    overall_provisional_df.Non_Hispanic_Asian.fillna(0) +
    overall_provisional_df.Non_Hispanic_Native_Hawaiian_or_Other_Pacific_Islander.fillna(0) +
    overall_provisional_df.Hispanic_or_Latino.fillna(0)) / 100, 4)
overall_provisional_counts_df['cdc_known_cases'] = round(
    overall_provisional_counts_df.Non_Hispanic_White.fillna(0) +
    overall_provisional_counts_df.Non_Hispanic_Black_or_African_American.fillna(0) +
    overall_provisional_counts_df.Non_Hispanic_American_Indian_or_Alaska_Native.fillna(0) +
    overall_provisional_counts_df.Non_Hispanic_Asian.fillna(0) +
    overall_provisional_counts_df.Non_Hispanic_Native_Hawaiian_or_Other_Pacific_Islander.fillna(0) +
    overall_provisional_counts_df.Hispanic_or_Latino.fillna(0))
# Extrapolate from the % known race/ethnicity cases and their counts to the total case count for the state.
total_cases_including_unknowns = round(
    overall_provisional_counts_df.cdc_known_cases[0] / overall_provisional_df.cdc_known_cases_percent[0], 0)

chart_denominator = 1000
cases_list = [overall_provisional_counts_df.Hispanic_or_Latino[0] / chart_denominator,
              overall_provisional_counts_df.Non_Hispanic_Black_or_African_American[0] / chart_denominator,
              overall_provisional_counts_df.Non_Hispanic_White[0] / chart_denominator,
              overall_provisional_counts_df.Non_Hispanic_Asian[0] / chart_denominator,
              overall_provisional_counts_df.Non_Hispanic_Native_Hawaiian_or_Other_Pacific_Islander[0] / chart_denominator,
              overall_provisional_counts_df.Non_Hispanic_American_Indian_or_Alaska_Native[0] / chart_denominator,
              total_cases_including_unknowns / chart_denominator,
]

# Population data from https://api.census.gov/data/2019/acs/acs1/profile?get=NAME,DP05_0071E,DP05_0078E,DP05_0077E,DP05_0080E,DP05_0081E,DP05_0079E,DP05_0070E&for=us:1
pop_list = [
    60481746 / chart_denominator,
    40596040  / chart_denominator,
    196789401 / chart_denominator,
    18427914  / chart_denominator,
    565473 / chart_denominator,
    2236348 / chart_denominator,
    328239523 / chart_denominator,
]
percent_list = []
for i in range(len(cases_list)):
  percent_list.append(cases_list[i] / pop_list[i])
prevalence = pd.DataFrame.from_dict({'group': [
    'Hispanic/Latino',
    'Black',
    'White',
    'Asian',
    'Native Hawaiian/Pacific Islander',
    'American Indian/Alaska Native',
    '*Total Including Unknowns*',
], 'percent': percent_list,
   'cases': cases_list,
   'population': pop_list,
})

bars = alt.Chart(prevalence).mark_bar().encode(
      x=alt.X('percent', axis=alt.Axis(format='.2%'), title=''),
      y=alt.Y('group', sort='-x', title=''),
      color=alt.Color('group', 
                      scale=alt.Scale(scheme='tableau20'),
                      title='',
                      legend=None),
      tooltip=[
                  alt.Tooltip('group:N', title='Race/Ethnicity Group'),
                  alt.Tooltip('percent:Q', format='.3%', title='Percent who died'),
                  alt.Tooltip('cases:Q', format=',.2f', title='Deaths in group (thousands)'),
                  alt.Tooltip('population:Q', format=',.0f', title='Population of group (thousands)'),
      ]
).properties(
   title='Percent of Race/Ethnicity Group who died from COVID-19 based on CDC Provisional Deaths Data as of %s' % provisional_date_display_name
)

bars.display()

All the percentages are larger partly due to the more recent update time. In these results, 0.126% of Hispanics/Latinos have died from COVID-19, while 0.154% of Black people have died from COVID-19. That translates to 126 deaths out of every 100K Hispanic/Latino people and 154 deaths out of every 100K Black people. The group with the highest rate of deaths is American Indian/Alaska Native with 216 deaths per 100K people. These results are more similar to the Covid Racial Data Tracker and American Public Media Research Lab than the CDC case data.

We'll examine which states and counties have data that are as reliable as state public health websites. If we can rely on the CDC's dataset for some states and counties, that could reduce the amount of manual data collection needed to replace the Covid Tracking Project's dataset.

Overview

The goal of this analysis is to assess the completeness of the CDC's Case Surveillance Restricted Access dataset and the CDC's Provisional Deaths datasets to evaluate their feasibility in examining disparities in race/ethnicity for COVID-19 deaths at the state and county levels.

The top-level data completeness findings for the CDC case dataset are:

  1. Data Overview: The field indicating if the person died or not is only known for 50% of cases. For cases where the person died, race/ethnicity was known for 78% of deaths, as opposed to 98%-100% for all the other fields below.
In [ ]:
#@title
field_list = ['cdc_case_earliest_dt', 'current_status', 'res_state', 'res_county', 'sex', 'age_group', 'race_ethnicity_combined']
FieldAnalysis(project_id, cdc_table, field_list).display()
  1. Total Death Counts: The CDC dataset contains 80% of the total deaths reported in the Covid Tracking Project (CTP). There's high variability at the state level with 10 states reporting fewer than 10% of deaths and several reporting 0%.
  2. Deaths with Race/Ethnicity: Race/ethnicity data is available for 78% of deaths in the CDC dataset compared to 93% in the Covid Racial Data Tracker. The Covid Racial Tracker has less variability with all but one state reporting at least 70% of deaths with race/ethnicity.

We will also evaluate a better source for deaths data that comes from death certificates, as reported in the CDC Provisional Deaths State dataset, the CDC Provisional Deaths County dataset, and on this CDC dashboard. The CDC Provisional Deaths State dataset contains 96% of the total deaths in the Covid Tracking Project and 99% of the deaths have race/ethnicity.

We used a composite measurement to evaluate the CDC case dataset and provisional deaths datasets' completeness as compared to the Covid Racial Data Tracker (CRDT) at the state level. We calculated the percent of total deaaths that have race/ethnicity data and broke it down into its two separate components: the percentage of expected death counts included and the percentage of deaths included with race/ethnicity. We looked at the number of states that had at least 50% of total death counts with race/ethnicity and those that had at least 85%.

In [ ]:
#@title
# Manually update these fields based on the latest CDC data.
# SELECT
# count(*) as count
# FROM `msm-secure-data-1b.ndunlap_secure.cdc_restricted_access_20210131`
# WHERE death_yn = 'Yes'
# AND race_ethnicity_combined NOT IN ('Unknown', 'NA', 'Missing')
# https://covidtracking.com/data/national
# County data calculated in Counties: CDC vs. NYT section.
row_names = [
    '% of CTP deaths count',
    '% with race/ethnicity',
    'Composite % of total with race/ethnicity',
    'Number of states with composite > 85%',
    '(as a percent of all states)',
    'Number of states with composite > 50%',
    '(as a percent of all states)',
]
crdt_metadata = [
    '100%', 
    '93%',
    '93%',
    '44',
    '(86%)',
    '50',
    '(98%)',
]
cdc_metadata = [
    '80%', 
    '78%',
    '62%',
    '8',
    '(17%)',
    '33',
    '(69%)',
]
cdc_provisional_metadata = [
    '96%', 
    '99%',
    '95%',
    '40',
    '(80%)',
    '50',
    '(98%)',
]
table_data = {'CRDT': crdt_metadata, 'CDC Case Surveillance': cdc_metadata, 'CDC Provisional Deaths': cdc_provisional_metadata}
metadata_df = pd.DataFrame(table_data, index=row_names)
metadata_df.head(15)
Out[ ]:
CRDT CDC Case Surveillance CDC Provisional Deaths
% of CTP deaths count 100% 80% 96%
% with race/ethnicity 93% 78% 99%
Composite % of total with race/ethnicity 93% 62% 95%
Number of states with composite > 85% 44 8 40
(as a percent of all states) (86%) (17%) (80%)
Number of states with composite > 50% 50 33 50
(as a percent of all states) (98%) (69%) (98%)

The provisional deaths dataset offers an excellent alternative to the case dataset at the state level. At the county level, the provisional deaths dataset only includes counties with 100 or more deaths, so it contains fewer counties than the case data, but those counties account for a larger percentage of the U.S. population. We will discuss the tradeoffs for county-level datasets in more detail later on in this analysis.

What we didn't include in this report:

Completeness Analysis

Total Death Counts

Baseline: NYT vs. CRDT

To get a baseline of how much we could expect the CDC death counts to match the CRDT or NYT, we can see how closely the CRDT and NYT match each other. Each dot below is a state (hover to see details), and the black line shows where the NYT and CRDT death counts are equal.

In [ ]:
#@title
def CreateScatterPlot(
    chart_df, fields_dict, title, scale_max, height, width, geo, metric_type):
  
  geo_field = 'state'
  geo_field_display_name = 'State'
  if geo == 'county':
    geo_field = 'state_county'
    geo_field_display_name = 'County'

  if metric_type == 'ratio':
    scale_scheme = 'blueorange'
    scale_reverse = True
    scale_domain = [0, 2]
    legend_format = '.1f'
    axis_format = ',.0f'
  elif metric_type == 'percent':
    scale_scheme = 'redyellowblue'
    scale_reverse = False
    scale_domain = [0, 1]
    legend_format = '.0%'
    axis_format = '.0%'

  tooltips = [alt.Tooltip(geo_field + ':N', title=geo_field_display_name)]
  for field in ('y', 'x', 'percent'):
    tooltips.append(alt.Tooltip(
        fields_dict[field]['name'] + ':Q',
        format=fields_dict[field]['format'],
        title=fields_dict[field]['title'],
    ))
  plot = alt.Chart(chart_df).mark_circle(size=60).encode(
      alt.X(fields_dict['x']['name'] + ':Q', axis=alt.Axis(title=fields_dict['x']['title'], format=axis_format),
          scale=alt.Scale(domain=(0, scale_max))
      ),
      alt.Y(fields_dict['y']['name'] + ':Q', axis=alt.Axis(title=fields_dict['y']['title'], format=axis_format),
          scale=alt.Scale(domain=(0, scale_max))
      ),
      color=alt.Color(fields_dict['percent']['name'],
                      type='quantitative',
                      scale=alt.Scale(scheme=scale_scheme,
                                      reverse=scale_reverse,
                                      domain=scale_domain,
                                      clamp=True),
                      legend=alt.Legend(format=legend_format),
                      title=metric_type.capitalize()),
      tooltip=tooltips,
  ).properties(
      height=height,
      width=width,
  )
  if metric_type == 'ratio':
    plot.interactive()

  line = pd.DataFrame({
      'x': [0, scale_max],
      'y': [0, scale_max],
  })

  if metric_type == 'ratio':
    line_plot = alt.Chart(line).mark_line(color='black').encode(
        x='x',
        y='y',
    )
  elif metric_type == 'percent':
    line_plot = (
        alt.Chart(pd.DataFrame({'x': [.5]})).mark_rule().encode(y='x') +
        alt.Chart(pd.DataFrame({'y': [.5]})).mark_rule().encode(x='y')
    )
  # Add interative for concatenating due to https://github.com/altair-viz/altair/issues/2010.
  scatter = (plot + line_plot).properties(
      title=title,
      height=height,
      width=width,
  ).interactive()
  return scatter

def CreateMap(
    chart_df, fields_dict, title, scale_max, height, width, geo, metric_type):
  
  geo_field = 'state'
  geo_field_display_name = 'State'
  fips_code = 'state_fips_code'
  topo_feature = us_states
  if geo == 'county':
    geo_field = 'state_county'
    geo_field_display_name = 'County'
    fips_code = 'county_fips'
    topo_feature = us_counties

  if metric_type == 'ratio':
    scale_scheme = 'blueorange'
    scale_reverse = True
    scale_domain = [0, 2]
    legend_format = '.1f'
  elif metric_type == 'percent':
    scale_scheme = 'redyellowblue'
    scale_reverse = False
    scale_domain = [0, 1]
    legend_format = '.0%'

  highlight = alt.selection_single(on='mouseover', fields=['id', fips_code], empty='none')
  tooltips = [alt.Tooltip(geo_field + ':N', title=geo_field_display_name)]
  for field in ('y', 'x', 'percent'):
    tooltips.append(alt.Tooltip(
        fields_dict[field]['name'] + ':Q',
        format=fields_dict[field]['format'],
        title=fields_dict[field]['title'],
    ))

  field_names = [geo_field]
  field_names.extend([fields_dict[field]['name'] for field in fields_dict])
  plot = alt.Chart(topo_feature).mark_geoshape(
        stroke='white',
        strokeOpacity=.2,
        strokeWidth=1
    ).project(
      type='albersUsa'
    ).transform_lookup(
        lookup='id',
        from_=alt.LookupData(chart_df, fips_code, field_names)
    ).encode(
        alt.Color(fields_dict['percent']['name'],
                  type='quantitative',  
                  legend=alt.Legend(format=legend_format),
                  scale=alt.Scale(scheme=scale_scheme,
                                  reverse=scale_reverse,
                                  domain=scale_domain,
                                  clamp=True,
                                  ),
                  title=metric_type.capitalize()),
         tooltip=tooltips
    ).add_selection(
        highlight,
    )

  states_outline = alt.Chart(us_states).mark_geoshape(stroke='white', strokeWidth=1.5, fillOpacity=0, fill='white').project(
        type='albersUsa'
  )

  states_fill = alt.Chart(us_states).mark_geoshape(
        fill='silver',
        stroke='white'
  ).project('albersUsa')

  layered_map = alt.layer(states_fill, plot, states_outline).properties(
        height=height,
        width=width,
        title=title,
  )
  return layered_map

def CreateScatterPlotAndMap(
    chart_df, fields_dict, title, total_cases_scale_max, scatter_height, scatter_width, map_width, geo, metric_type):
  scatter = CreateScatterPlot(
    chart_df, fields_dict, title, total_cases_scale_max, scatter_height, scatter_width, geo, metric_type)
  map = CreateMap(
    chart_df, fields_dict, title, total_cases_scale_max, scatter_height, map_width, geo, metric_type)
  return (scatter | map).configure_view(
       strokeWidth=0,
   ).configure_mark(
       stroke='grey'
   ).configure_legend(
       gradientLength=scatter_height - 50
   )

def PrintSummaryStats(chart_df, field='percent'):
  plus_minus_15_df = chart_df[chart_df[field] >= .85]
  plus_minus_15_df = plus_minus_15_df[plus_minus_15_df[field] <= 1.15]
  print('between +/-15%: ', len(plus_minus_15_df), round(len(plus_minus_15_df) / len(chart_df), 2))
  plus_minus_50_df = chart_df[chart_df[field] >= .50]
  plus_minus_50_df = plus_minus_50_df[plus_minus_50_df[field] <= 1.50]
  print('between +/-50%: ', len(plus_minus_50_df), round(len(plus_minus_50_df) / len(chart_df), 2))
  print('< than .50: ', len(chart_df[chart_df[field] < .5]))
  print('> than 1.50: ', len(chart_df[chart_df[field] > 1.5]))
  print(chart_df[field].describe())
In [ ]:
#@title
crdt_df = pd.io.gbq.read_gbq(crdt_query, project_id=project_id)
crdt_df.set_index('state', inplace=True)

nyt_states_df = pd.io.gbq.read_gbq(nyt_states_query, project_id=project_id)
nyt_states_df.state_fips_code.unique()
nyt_territories = ('Puerto Rico', 'Guam', 'Virgin Islands', 'Northern Mariana Islands', 'American Samoa')
for territory in nyt_territories:
  nyt_states_df = nyt_states_df[nyt_states_df.state_name != territory]
nyt_states_df['state_fips_code'] = nyt_states_df.state_fips_code.astype(int)
nyt_states_df.set_index('state_fips_code', inplace=True)

crdt_df.reset_index(inplace=True)
crdt_df['state_fips_code'] = crdt_df.state
crdt_df = crdt_df.replace(to_replace={'state_fips_code': states_to_fips})
crdt_df.set_index('state_fips_code', inplace=True)
nyt_crdt_merged_df = nyt_states_df.join(crdt_df, on="state_fips_code", how='inner', lsuffix='_left', rsuffix='_right')

nyt_crdt_merged_df['percent'] = round(nyt_crdt_merged_df.nyt_cases / nyt_crdt_merged_df.crdt_cases, 2)
nyt_crdt_merged_df
nyt_crdt_merged_df.reset_index(inplace=True)


below_15 = len(nyt_crdt_merged_df[nyt_crdt_merged_df.percent < .85]) / len(nyt_crdt_merged_df)
above_15 = len(nyt_crdt_merged_df[nyt_crdt_merged_df.percent > 1.15]) / len(nyt_crdt_merged_df)
#print('between +/-15%: ', round(1 - below_15 - above_15, 2))
#nyt_crdt_merged_df.percent.describe()
In [ ]:
#@title
nyt_crdt_fields_dict = {
    'x': {'name': 'crdt_cases', 'format': ',', 'title': 'CRDT deaths'},
    'y': {'name': 'nyt_cases', 'format': ',', 'title': 'NYT deaths'},
    'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of NYT to CRDT'},
}
nyt_crdt_title = 'Ratio of NYT to CRDT Deaths by State as of %s' % date_display_name

CreateScatterPlotAndMap(
    nyt_crdt_merged_df, nyt_crdt_fields_dict, nyt_crdt_title, total_cases_scale_max, scatter_height, scatter_width, map_width, 'state', 'ratio'
).display()

States: CDC vs. CRDT

We can see below that the CDC death counts differ from the CRDT death counts much more drastically than the NYT did.

In [ ]:
#@title
cdc_states_df = pd.io.gbq.read_gbq(cdc_states_query, project_id=project_id)
cdc_states_df.rename(columns={'res_state': 'state'}, inplace=True)
cdc_states_df.set_index('state', inplace=True)

crdt_df = pd.io.gbq.read_gbq(crdt_query, project_id=project_id)

for territory in territories:
  crdt_df = crdt_df[crdt_df.state != territory]

crdt_df.set_index('state', inplace=True)
cdc_crdt_merged_df = cdc_states_df.join(crdt_df, on="state", how='right', lsuffix='_left', rsuffix='_right')
cdc_crdt_merged_df.reset_index(inplace=True)
cdc_crdt_merged_df['state_fips_code'] = cdc_crdt_merged_df.state
cdc_crdt_merged_df = cdc_crdt_merged_df.replace(to_replace={'state_fips_code': states_to_fips})
cdc_crdt_merged_df['percent'] = round(cdc_crdt_merged_df.cdc_cases / cdc_crdt_merged_df.crdt_cases, 4)

# PrintSummaryStats(cdc_crdt_merged_df)
In [ ]:
#@title
# CDC vs. NYT county

df = pd.io.gbq.read_gbq(cdc_counties_query, project_id=project_id)
for territory in territories:
  df = df[df.res_state != territory]

df_county_fips_map = pd.io.gbq.read_gbq(county_fips_mapping_query, project_id=project_id)

df_county_fips_map.cdc_county = df_county_fips_map.cdc_county.str.lower()
df_county_fips_map['state_county'] = df_county_fips_map.state + '-' + df_county_fips_map.cdc_county
df_county_fips_map['state_county'] = df_county_fips_map.state_county.astype('string').str.strip()
df_county_fips_map.set_index('state_county', inplace=True)
In [ ]:
#@title
# Concatenate the state and county names because county names are not unique across states.
df.res_county = df.res_county.str.lower()
df['state_county'] = df.res_state + '-' + df.res_county
df['state_county'] = df.state_county.astype('string').str.strip()
df.set_index('state_county', inplace=True)
df['race_ethnicity_combined'] = df.race_ethnicity_combined.astype('string').str.strip()

df = df.replace(to_replace={'race_ethnicity_combined': race_ethnicity_combined_map})
In [ ]:
#@title
mismatches_df = df.join(df_county_fips_map, on="state_county", how='outer', lsuffix='_left', rsuffix='_right')
mismatches_df = mismatches_df[mismatches_df.county_fips.isna()]
mismatches_df = mismatches_df[mismatches_df.res_state != 'NA']
mismatches_df = mismatches_df[mismatches_df.res_state != 'Unknown']
mismatches_df = mismatches_df[mismatches_df.res_county != 'na']
mismatches_df = mismatches_df[mismatches_df.res_county != 'unknown']
#print(mismatches_df.cases.sum())
#print('vs. 60363 with NULL county_fips_code')
# SELECT 
#count(*) as total_cases,
#FROM `msm-secure-data-1b.ndunlap_secure.cdc_restricted_access_20201231`
#WHERE county_fips_code IS NULL
In [ ]:
#@title
merged_df = df.join(df_county_fips_map, on="state_county", how='inner', lsuffix='_left', rsuffix='_right')

# Create a crosstab table with rows = counties, columns = race_ethnicity_combined.
crosstab_df = pd.crosstab(merged_df['county_fips'], merged_df.race_ethnicity_combined, values=merged_df.cases, aggfunc=sum,
                          margins=True,
                          margins_name='total_cases'
)
# Have to reset_index() to go from pandas multi-index to single index.
crosstab_df = crosstab_df.reset_index()
crosstab_df.drop(axis=0, index=len(crosstab_df) - 1, inplace=True)
crosstab_df['county_fips'] = crosstab_df.county_fips.astype(int)
crosstab_df['total_known_cases'] = crosstab_df['total_cases'] - crosstab_df.na_cases.fillna(0) - crosstab_df.unknown_cases.fillna(0)
In [ ]:
#@title
# Get the display names for each county.
# Use ACS data that only has one FIPS code per county unlike the fips_county_map.
df_acs_name_lookup = pd.io.gbq.read_gbq(acs_population_data_query, project_id=project_id)

df_acs_name_lookup['state_county'] = df_acs_name_lookup.county.astype('string').str.strip() + ', ' + df_acs_name_lookup.state.astype('string').str.strip()
df_acs_name_lookup.drop(columns=['state', 'county'], inplace=True)
df_acs_name_lookup.set_index('county_fips', inplace=True)

county_chart_df = crosstab_df.join(df_acs_name_lookup, on="county_fips", how='inner', lsuffix='_left', rsuffix='_right')
county_chart_df.county_fips = county_chart_df.county_fips.astype(int)

#print(len(county_chart_df))
#print(county_chart_df.total_pop.sum())
#print(county_chart_df.total_pop.sum() / 324697795)  # Population covered in these counties
In [ ]:
#@title

nyt_counties_df = pd.io.gbq.read_gbq(nyt_counties_query, project_id=project_id)
nyt_counties_df.rename(columns={'county_fips_code': 'county_fips'}, inplace=True)
nyt_counties_df.county_fips.unique()
nyt_counties_df['county_fips'] = nyt_counties_df.county_fips.astype(int)
nyt_counties_df.set_index('county_fips', inplace=True)

county_chart_df.set_index('county_fips', inplace=True)
nyt_merged_df = county_chart_df.join(nyt_counties_df, on="county_fips", how='inner', lsuffix='_left', rsuffix='_right')
nyt_merged_df = nyt_merged_df.reset_index()
nyt_merged_df['percent'] = round(nyt_merged_df.total_cases / nyt_merged_df.nyt_cases, 2)

#PrintSummaryStats(nyt_merged_df)
In [ ]:
#@title
cdc_crdt_fields_dict = {
    'x': {'name': 'crdt_cases', 'format': ',', 'title': 'CRDT deaths'},
    'y': {'name': 'cdc_cases', 'format': ',', 'title': 'CDC deaths'},
    'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CDC to CRDT'},
}
cdc_crdt_title = 'Ratio of CDC to CRDT Deaths by State as of %s' % date_display_name

CreateScatterPlotAndMap(
    cdc_crdt_merged_df, cdc_crdt_fields_dict, cdc_crdt_title, total_cases_scale_max, scatter_height, scatter_width, map_width, 'state', 'ratio'
).display()

Counties: CDC vs. NYT

We can do the same analysis at the county level using the CDC vs. NYT data.

In [ ]:
#@title
cdc_nyt_fields_dict = {
    'x': {'name': 'nyt_cases', 'format': ',', 'title': 'NYT deaths'},
    'y': {'name': 'total_cases', 'format': ',', 'title': 'CDC deaths'},
    'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CDC to NYT'},
}
cdc_nyt_title = 'Ratio of CDC Deaths to NYT Deaths by County as of %s' % date_display_name
zoom_cdc_nyt_title = 'Zoom in on counties with up to 3,000 Deaths'

scatter = CreateScatterPlot(
    nyt_merged_df, cdc_nyt_fields_dict, cdc_nyt_title, county_cases_scale_max, scatter_height, scatter_width, 'county', 'ratio'
)
zoom_scatter = CreateScatterPlot(
    nyt_merged_df, cdc_nyt_fields_dict, zoom_cdc_nyt_title, county_cases_zoom_scale_max, scatter_height, scatter_width, 'county', 'ratio'
)

(scatter | zoom_scatter).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).configure_mark(
    stroke='grey'
).display()
In [ ]:
#@title
cdc_nyt_fields_dict = {
    'x': {'name': 'nyt_cases', 'format': ',', 'title': 'NYT deaths'},
    'y': {'name': 'total_cases', 'format': ',', 'title': 'CDC deaths'},
    'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CDC to NYT'},
}
cdc_nyt_title = 'Ratio of CDC Deaths to NYT Deaths by County as of %s' % date_display_name

cdc_nyt_map = CreateMap(
    nyt_merged_df, cdc_nyt_fields_dict, cdc_nyt_title, total_cases_scale_max, map_height, map_width, 'county', 'ratio'
)
cdc_crdt_map = CreateMap(
    cdc_crdt_merged_df, cdc_crdt_fields_dict, cdc_crdt_title, total_cases_scale_max, map_height, map_width, 'state', 'ratio'
)

(cdc_crdt_map | cdc_nyt_map).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).display()

Notes:

  • The legend only goes to 2.0, and all counties with a larger ratio are shown in the same dark blue color.
  • States can have deaths but no counties with deaths because county names can be suppressed for privacy reasons.
  • A larger version of the county map for hovering over smaller counties is available in the Appendix.

Deaths with Race/Ethnicity

States and Counties: CDC

In [ ]:
#@title
states_df = pd.io.gbq.read_gbq(compare_cases_unknowns_query, project_id=project_id)
for state in ('Unknown', 'NA', 'OCONUS'):
  states_df = states_df[states_df.res_state != state]

states_df['race_ethnicity_combined'] = states_df.race_ethnicity_combined.astype('string').str.strip()
states_df = states_df.replace(to_replace={'race_ethnicity_combined': {
    'Asian, Non-Hispanic': 'cdc_known_cases',
    'Black, Non-Hispanic': 'cdc_known_cases',
    'White, Non-Hispanic': 'cdc_known_cases',
    'American Indian/Alaska Native, Non-Hispanic': 'cdc_known_cases',
    'Hispanic/Latino': 'cdc_known_cases',
    'Multiple/Other, Non-Hispanic': 'cdc_known_cases',
    'Native Hawaiian/Other Pacific Islander, Non-Hispanic': 'cdc_known_cases',
    'Missing': 'cdc_unknown_cases',
    'Unknown': 'cdc_unknown_cases',
    'NA': 'cdc_na_cases',
    }})
states_df.rename(columns={'res_state': 'state'}, inplace=True)
In [ ]:
#@title
crosstab_df = pd.crosstab(states_df['state'], states_df.race_ethnicity_combined, values=states_df.cdc_cases, aggfunc=sum,
                          margins=True,
                          margins_name='cdc_cases'
)
# Have to reset_index() to go from pandas multi-index to single index.
crosstab_df = crosstab_df.reset_index()
crosstab_df.drop(axis=0, index=len(crosstab_df) - 1, inplace=True)
crosstab_df['cdc_known_or_na_cases'] = crosstab_df['cdc_cases'] - crosstab_df.cdc_unknown_cases.fillna(0)
crosstab_df['cdc_known_cases'] = crosstab_df['cdc_cases'] - crosstab_df.cdc_na_cases.fillna(0) - crosstab_df.cdc_unknown_cases.fillna(0)
crosstab_df

crdt_merged_df = crosstab_df.join(crdt_df, on="state", how='inner', lsuffix='_left', rsuffix='_right')
crdt_merged_df.reset_index(inplace=True)
crdt_merged_df['state_fips_code'] = crdt_merged_df.state
crdt_merged_df = crdt_merged_df.replace(to_replace={'state_fips_code': states_to_fips})
crdt_merged_df['cdc_known_cases_percent'] = round(crdt_merged_df.cdc_known_cases / crdt_merged_df.cdc_cases, 4)
crdt_merged_df['cdc_known_or_na_cases_percent'] = round(crdt_merged_df.cdc_known_or_na_cases / crdt_merged_df.cdc_cases, 4)
crdt_merged_df['percent'] = round(crdt_merged_df.cdc_cases / crdt_merged_df.crdt_cases, 4)
crdt_merged_df['percent_known_cases'] = round(crdt_merged_df.cdc_known_cases / crdt_merged_df.crdt_known_race_cases, 4)

crdt_merged_df_no_ny = crdt_merged_df[crdt_merged_df.state != 'NY']
#PrintSummaryStats(crdt_merged_df_no_ny)

When evaluating the percent of deaths that report on race/ethnicity in the CDC dataset, we also need to consider the 2% of overall deaths with race/ethnicity that were suppressed due to privacy reasons. We should give states and counties credit for reporting race/ethnicity data for those deaths even if we aren't able to use it due to privacy suppression. Below, the maps on the top left shows the percent of deaths with known race/ethnicity and the map on the top right shows the percent of deaths with known or suppressed race/ethnicity. The maps on the bottom show the same information at the county level.

In [ ]:
#@title

chart_df = county_chart_df.copy(deep=True)
chart_df.reset_index(inplace=True)
chart_df.county_fips = chart_df.county_fips.astype(int)
chart_df['percent_known_cases'] = round(chart_df.total_known_cases / chart_df.total_cases, 2)
chart_df['total_known_or_na_cases'] = chart_df.total_known_cases.fillna(0) + chart_df.na_cases.fillna(0)
chart_df['percent_known_or_na_cases'] = round(chart_df.total_known_or_na_cases / chart_df.total_cases, 2)
In [ ]:
#@title
cdc_known_state_fields_dict = {
    'x': {'name': 'cdc_known_cases', 'format': ',', 'title': 'Known race/ethnicity deaths'},
    'y': {'name': 'cdc_cases', 'format': ',', 'title': 'CDC deaths'},
    'percent': {'name': 'cdc_known_cases_percent', 'format': '.0%', 'title': 'Percent known deaths'},
}

cdc_known_state_title = 'CDC Deaths with Known Race/Ethnicity as of %s' % date_display_name
cdc_known_state_map = CreateMap(
    crdt_merged_df, cdc_known_state_fields_dict, cdc_known_state_title, total_cases_scale_max, map_height, map_width, 'state', 'percent'
)

cdc_known_or_na_state_fields_dict = {
    'x': {'name': 'cdc_known_or_na_cases', 'format': ',', 'title': 'Known or suppressed race/ethnicity deaths'},
    'y': {'name': 'cdc_cases', 'format': ',', 'title': 'CDC deaths'},
    'percent': {'name': 'cdc_known_or_na_cases_percent', 'format': '.0%', 'title': 'Percent known or suppressed deaths'},
}
cdc_known_or_na_state_title = 'CDC Deaths with Known+Suppressed Race/Ethnicity as of %s' % date_display_name
cdc_known_or_na_state_map = CreateMap(
    crdt_merged_df, cdc_known_or_na_state_fields_dict, cdc_known_or_na_state_title, total_cases_scale_max, map_height, map_width, 'state', 'percent'
)

(cdc_known_state_map | cdc_known_or_na_state_map).configure(
    padding={"left": 0, "top": 5, "right": 0, "bottom": 5}
).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).display()
In [ ]:
#@title
cdc_known_county_fields_dict = {
    'x': {'name': 'total_known_cases', 'format': ',', 'title': 'Known race/ethnicity deaths'},
    'y': {'name': 'total_cases', 'format': ',', 'title': 'CDC deaths'},
    'percent': {'name': 'percent_known_cases', 'format': '.0%', 'title': 'Percent known deaths'},
}
cdc_known_county_title = 'CDC Deaths with Known Race/Ethnicity as of %s' % date_display_name
cdc_known_county_map = CreateMap(
    chart_df, cdc_known_county_fields_dict, cdc_known_county_title, total_cases_scale_max, map_height, map_width, 'county', 'percent'
)

cdc_known_or_na_county_fields_dict = {
    'x': {'name': 'total_known_or_na_cases', 'format': ',', 'title': 'Known or suppressed race/ethnicity deaths'},
    'y': {'name': 'total_cases', 'format': ',', 'title': 'CDC deaths'},
    'percent': {'name': 'percent_known_or_na_cases', 'format': '.0%', 'title': 'Percent known or suppressed deaths'},
}
cdc_known_or_na_county_title = 'CDC Deaths with Known+Suppressed Race/Ethnicity as of %s' % date_display_name
cdc_known_or_na_county_map = CreateMap(
    chart_df, cdc_known_or_na_county_fields_dict, cdc_known_or_na_county_title, total_cases_scale_max, map_height, map_width, 'county', 'percent'
)

(cdc_known_county_map | cdc_known_or_na_county_map).configure(
    padding={"left": 0, "top": 5, "right": 0, "bottom": 5}
).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).display()
In [ ]:
#@title
#PrintSummaryStats(crdt_merged_df, field='cdc_known_cases_percent')
#PrintSummaryStats(crdt_merged_df, field='cdc_known_or_na_cases_percent')
#tuple(crdt_merged_df[crdt_merged_df.cdc_known_or_na_cases_percent <= .5].state)

Note: A larger version of the county maps for hovering over smaller counties is available in the Appendix.

States: CDC vs. CRDT

How does the CDC dataset compare to the CRDT dataset, which is the most up-to-date aggregate data we have for race/ethnicity at the state level? Overall, 93% of the deaths in the CRDT dataset have known race/ethnicity compared to 78% in the CDC dataset (80% with suppressed).

We may even be undercounting the 93% of deaths with known race/ethnicity in the CRDT data because of the non-standard ways that each state reports on race/ethnicity, as described in this Covid Racial Data Tracker analysis. If a state uses a combined race/ethnicity field, then it's a straightforward comparison to the CDC's combined race/ethnicity field. If a state uses separate fields for race/ethnicity, then we still use the number of people with known race within each state because all of the race categories will also contain Hispanic/Latino people. We could potentially be undercounting the number of people with known race/ethnicity in the CRDT if there are people who have unknown race but known ethnicity. If we adjusted the numbers in those cases, it would make the CRDT percentages look even better in comparison to the CDC data.

In [ ]:
#@title
crdt_known_state_fields_dict = {
    'x': {'name': 'crdt_known_race_cases', 'format': ',', 'title': 'Known race/ethnicity deaths'},
    'y': {'name': 'crdt_cases', 'format': ',', 'title': 'CRDT deaths'},
    'percent': {'name': 'crdt_known_race_cases_percent', 'format': '.0%', 'title': 'Percent known deaths'},
}

crdt_known_state_title = 'CRDT Deaths with Known Race/Ethnicity as of %s' % date_display_name
crdt_known_map = CreateMap(
    cdc_crdt_merged_df, crdt_known_state_fields_dict, crdt_known_state_title, total_cases_scale_max, map_height, map_width, 'state', 'percent'
)

(crdt_known_map | cdc_known_state_map).configure(
    padding={"left": 0, "top": 5, "right": 0, "bottom": 5}
).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).display()
In [ ]:
#@title
fields_dict = {
    'x': {'name': 'crdt_known_race_cases', 'format': ',', 'title': 'CRDT known race/ethnicity deaths'},
    'y': {'name': 'cdc_known_cases', 'format': ',', 'title': 'CDC known race/ethnicity deaths'},
    'percent': {'name': 'percent_known_cases', 'format': '.2f', 'title': 'Ratio of CDC to CRDT'},
}
title = 'Ratio of CDC to CRDT Deaths with Known Race/Ethnicity as of %s' % date_display_name

CreateScatterPlotAndMap(
    crdt_merged_df, fields_dict, title, cases_known_scale_max, scatter_height, scatter_width, map_width - 5, 'state', 'ratio'
).display()
In [ ]:
#@title
#print('CRDT known % ', crdt_merged_df.crdt_known_race_cases.sum() / crdt_merged_df.crdt_cases.sum())
#PrintSummaryStats(cdc_crdt_merged_df, field='crdt_known_race_cases_percent')

Better Data from Death Certificates

The CDC has an alternative public source for death data with race/ethnicity at the state and county levels that comes from death certificates. This dataset is more complete than the case surveillance dataset in every way except that the county-level dataset only contains about one fourth of the counties in the CDC case data. The reason for this is that the CDC Provisional Deaths County dataset only includes counties with 100 or more deaths. The CDC Provisional Deaths State dataset also has a limit of 100 or more deaths per state, but all states had reached that threshold by Feb 3, 2021.

The table below summarizes the completeness of the CDC Case dataset vs. the CDC Provisional Deaths datasets for counties and states.

In [ ]:
#@title
# Manually update these fields based on the latest CDC data.
row_names = [
    'Update frequency',
    'Latest deaths date as of Feb 9, 2021',
    'Deaths in dataset as of date',
    'Deaths in CTP as of date',
    '(as a % of CTP)',
    'Number of counties',
    '(as a % of all counties)',
    'Population in those counties',
    '(as a % of total U.S population – States + D.C.)',
    'Deaths with known race/ethnicity',
    '(as a % of deaths in dataset)',
    'Access',
    'Limitations'
]
cdc_deaths_county_metadata = [
    'Weekly',
    'Feb 3, 2021',
    '368K',
    '441K',
    '(84%)',
    '644',
    '(20%)',
    '253K',
    '(78%)',
    '362K',
    '(98%)',
    'Public',
    'Cumulative, counties >= 100 deaths'
]
cdc_deaths_state_metadata = [
    'Weekly',
    'Feb 3, 2021',
    '421K',
    '441K',
    '(96%)',
    '-',
    '-',
    '-',
    '-',
    '418K',
    '(99%)',
    'Public',
    'Cumulative'
]
cdc_cases_metadata = [
    'Monthly', 
    'Jan 16, 2021',
    '309K',
    '386K',
    '(80%)',
    '2,378',
    '(76%)',
    '280M',
    '(86%)',
    '239K',
    '(78%)',
    'Restricted',
    'Data completeness issues'
]
table_data = {'CDC Case Surveillance': cdc_cases_metadata,
              'CDC Provisional Deaths County': cdc_deaths_county_metadata,
              'CDC Provisional Deaths State': cdc_deaths_state_metadata,
}
metadata_df = pd.DataFrame(table_data, index=row_names)
metadata_df.head(15)
Out[ ]:
CDC Case Surveillance CDC Provisional Deaths County CDC Provisional Deaths State
Update frequency Monthly Weekly Weekly
Latest deaths date as of Feb 9, 2021 Jan 16, 2021 Feb 3, 2021 Feb 3, 2021
Deaths in dataset as of date 309K 368K 421K
Deaths in CTP as of date 386K 441K 441K
(as a % of CTP) (80%) (84%) (96%)
Number of counties 2,378 644 -
(as a % of all counties) (76%) (20%) -
Population in those counties 280M 253K -
(as a % of total U.S population – States + D.C.) (86%) (78%) -
Deaths with known race/ethnicity 239K 362K 418K
(as a % of deaths in dataset) (78%) (98%) (99%)
Access Restricted Public Public
Limitations Data completeness issues Cumulative, counties >= 100 deaths Cumulative

The CDC Provisional Deaths datasets are more complete at the U.S. level on all measures except for the number of counties contained in the county-level data. The American Public Media Research Lab found that the CDC Provisional Deaths State dataset was actually more complete than public health website data for 14 states.

Total Death Counts

In [ ]:
#@title

cdc_provisional_deaths_state_df = pd.io.gbq.read_gbq(cdc_provisional_deaths_state_query, project_id=project_id)
cdc_provisional_deaths_state_df = cdc_provisional_deaths_state_df.replace(
    to_replace={'State': states_to_abbreviations})
cdc_provisional_deaths_state_counts_df = pd.io.gbq.read_gbq(cdc_provisional_deaths_state_counts_query, project_id=project_id)
cdc_provisional_deaths_state_counts_df = cdc_provisional_deaths_state_counts_df.replace(
    to_replace={'State': states_to_abbreviations})
cdc_provisional_deaths_state_df['cdc_known_cases_percent'] = round((
    cdc_provisional_deaths_state_df.Non_Hispanic_White.fillna(0) +
    cdc_provisional_deaths_state_df.Non_Hispanic_Black_or_African_American.fillna(0) +
    cdc_provisional_deaths_state_df.Non_Hispanic_American_Indian_or_Alaska_Native.fillna(0) +
    cdc_provisional_deaths_state_df.Non_Hispanic_Asian.fillna(0) +
    cdc_provisional_deaths_state_df.Non_Hispanic_Native_Hawaiian_or_Other_Pacific_Islander.fillna(0) +
    cdc_provisional_deaths_state_df.Hispanic_or_Latino.fillna(0)) / 100, 4)
cdc_provisional_deaths_state_counts_df['cdc_known_cases'] = round(
    cdc_provisional_deaths_state_counts_df.Non_Hispanic_White.fillna(0) +
    cdc_provisional_deaths_state_counts_df.Non_Hispanic_Black_or_African_American.fillna(0) +
    cdc_provisional_deaths_state_counts_df.Non_Hispanic_American_Indian_or_Alaska_Native.fillna(0) +
    cdc_provisional_deaths_state_counts_df.Non_Hispanic_Asian.fillna(0) +
    cdc_provisional_deaths_state_counts_df.Non_Hispanic_Native_Hawaiian_or_Other_Pacific_Islander.fillna(0) +
    cdc_provisional_deaths_state_counts_df.Hispanic_or_Latino.fillna(0))
cdc_provisional_deaths_state_df = cdc_provisional_deaths_state_df.set_index('State')
cdc_provisional_deaths_state_counts_df = cdc_provisional_deaths_state_counts_df.set_index('State')
cdc_provisional_deaths_state_df = cdc_provisional_deaths_state_df.join(
    cdc_provisional_deaths_state_counts_df, on="State", how='inner', lsuffix='_left', rsuffix='_right')
# Extrapolate from the % known race/ethnicity cases and their counts to the total case count for the state.
cdc_provisional_deaths_state_df['cdc_cases'] = round(
    cdc_provisional_deaths_state_df.cdc_known_cases / cdc_provisional_deaths_state_df.cdc_known_cases_percent, 0)
# Comnbine the case counts for NY and New York City, recalculate percentage, remove New York City. 
cdc_provisional_deaths_state_df.loc['NY', 'cdc_cases'] = (
    cdc_provisional_deaths_state_df.loc['NY', 'cdc_cases'] +
    cdc_provisional_deaths_state_df.loc['New York City', 'cdc_cases'])
cdc_provisional_deaths_state_df.loc['NY', 'cdc_known_cases'] = (
    cdc_provisional_deaths_state_df.loc['NY', 'cdc_known_cases'] +
    cdc_provisional_deaths_state_df.loc['New York City', 'cdc_known_cases'])
cdc_provisional_deaths_state_df.loc['NY', 'cdc_known_cases_percent'] = round(
    cdc_provisional_deaths_state_df.loc['NY', 'cdc_known_cases'] /
    cdc_provisional_deaths_state_df.loc['NY', 'cdc_cases'], 4)
cdc_provisional_deaths_state_df.drop(['New York City'], inplace=True)
cdc_provisional_deaths_state_df.reset_index(inplace=True)
cdc_provisional_deaths_state_df.rename(columns={'State': 'state'}, inplace=True)
cdc_provisional_deaths_state_df = cdc_provisional_deaths_state_df[
    ['state', 'cdc_cases', 'cdc_known_cases', 'cdc_known_cases_percent']].copy()

crdt_provisional_df = pd.io.gbq.read_gbq(crdt_provisional_query, project_id=project_id)
crdt_provisional_df.set_index('state', inplace=True)
cdc_provisional_crdt_merged_df = cdc_provisional_deaths_state_df.join(crdt_provisional_df, on="state", how='right', lsuffix='_left', rsuffix='_right')
cdc_provisional_crdt_merged_df.reset_index(inplace=True)
cdc_provisional_crdt_merged_df['state_fips_code'] = cdc_provisional_crdt_merged_df.state
cdc_provisional_crdt_merged_df = cdc_provisional_crdt_merged_df.replace(to_replace={'state_fips_code': states_to_fips})
cdc_provisional_crdt_merged_df['percent'] = round(cdc_provisional_crdt_merged_df.cdc_cases / cdc_provisional_crdt_merged_df.crdt_cases, 4)
cdc_provisional_crdt_merged_df['percent_known_cases'] = round(cdc_provisional_crdt_merged_df.cdc_known_cases / cdc_provisional_crdt_merged_df.crdt_known_race_cases, 4)

# Counties data

df_acs_name_lookup = pd.io.gbq.read_gbq(acs_population_data_query, project_id=project_id)
df_acs_name_lookup.set_index('county_fips', inplace=True)

cdc_provisional_deaths_county_df = pd.io.gbq.read_gbq(cdc_provisional_deaths_county_query, project_id=project_id)

cdc_provisional_deaths_county_df['county_fips'] = cdc_provisional_deaths_county_df.FIPS_Code
cdc_provisional_deaths_county_df.set_index('county_fips', inplace=True)
cdc_provisional_deaths_county_df['state_county'] = cdc_provisional_deaths_county_df.County_Name + ', ' + cdc_provisional_deaths_county_df.State
cdc_provisional_deaths_county_df['total_known_cases'] = round((
    cdc_provisional_deaths_county_df.Non_Hispanic_White.fillna(0) +
    cdc_provisional_deaths_county_df.Non_Hispanic_Black.fillna(0) +
    cdc_provisional_deaths_county_df.Non_Hispanic_American_Indian_or_Alaska_Native.fillna(0) +
    cdc_provisional_deaths_county_df.Non_Hispanic_Asian.fillna(0) +
    cdc_provisional_deaths_county_df.Non_Hispanic_Native_Hawaiian_or_Other_Pacific_Islander.fillna(0) +
    cdc_provisional_deaths_county_df.Hispanic.fillna(0)) * cdc_provisional_deaths_county_df.COVID_19_Deaths, 0)

county_chart_provisional_df = cdc_provisional_deaths_county_df.join(df_acs_name_lookup, on="county_fips", how='inner', lsuffix='_left', rsuffix='_right')
county_chart_provisional_df.reset_index(inplace=True)
county_chart_provisional_df.county_fips = county_chart_provisional_df.county_fips.astype(int)

nyt_counties_provisional_df = pd.io.gbq.read_gbq(nyt_counties_provisional_query, project_id=project_id)
nyt_counties_provisional_df.rename(columns={'county_fips_code': 'county_fips'}, inplace=True)
nyt_counties_provisional_df.county_fips.unique()
nyt_counties_provisional_df['county_fips'] = nyt_counties_provisional_df.county_fips.astype(int)
nyt_counties_provisional_df.set_index('county_fips', inplace=True)

county_chart_provisional_df.set_index('county_fips', inplace=True)
nyt_merged_provisional_df = county_chart_provisional_df.join(nyt_counties_provisional_df, on="county_fips", how='left', lsuffix='_left', rsuffix='_right')
nyt_merged_provisional_df = nyt_merged_provisional_df.reset_index()
nyt_merged_provisional_df['percent'] = round(nyt_merged_provisional_df.COVID_19_Deaths / nyt_merged_provisional_df.nyt_cases, 4)
nyt_merged_provisional_df['percent_known_cases'] = round(nyt_merged_provisional_df.total_known_cases / nyt_merged_provisional_df.COVID_19_Deaths, 4)

#print('total deaths: ', nyt_merged_provisional_df.COVID_19_Deaths.sum())
#print('total counties: ', len(county_chart_provisional_df))
#print('as % of counties: ', len(county_chart_provisional_df) / 3143)
#print('population in counties: ', county_chart_provisional_df.total_pop.sum())
#print('as % of total population: ', county_chart_provisional_df.total_pop.sum() / 324697795)  # Population covered in these counties
#print('known race/ethnicity: ', nyt_merged_provisional_df.total_known_cases.sum())
#print('% known race/ethnicity: ', nyt_merged_provisional_df.total_known_cases.sum() / nyt_merged_provisional_df.COVID_19_Deaths.sum())
#print(0.55 * 324697795) # NYT population
#print('state file deaths: ', cdc_provisional_crdt_merged_df.cdc_cases.sum())
#print('state file known race/ethnicity: ', cdc_provisional_crdt_merged_df.cdc_known_cases.sum())
#print('state file % known race/ethnicity: ', cdc_provisional_crdt_merged_df.cdc_known_cases.sum() / cdc_provisional_crdt_merged_df.cdc_cases.sum())
In [ ]:
#@title
cdc_crdt_fields_dict = {
    'x': {'name': 'crdt_cases', 'format': ',', 'title': 'CRDT deaths'},
    'y': {'name': 'cdc_cases', 'format': ',', 'title': 'CDC deaths'},
    'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CDC to CRDT'},
}
cdc_crdt_title = 'Ratio of CDC Provisional Deaths to CRDT Deaths as of %s' % provisional_date_display_name

CreateScatterPlotAndMap(
    cdc_provisional_crdt_merged_df, cdc_crdt_fields_dict, cdc_crdt_title, total_cases_scale_max, scatter_height, scatter_width, map_width, 'state', 'ratio'
).display()
In [ ]:
#@title
cdc_nyt_provisional_fields_dict = {
    'x': {'name': 'nyt_cases', 'format': ',', 'title': 'NYT deaths'},
    'y': {'name': 'COVID_19_Deaths', 'format': ',', 'title': 'CDC deaths'},
    'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CDC to NYT'},
}
cdc_nyt_provisional_title = 'Ratio of CDC Provisional Deaths to NYT Deaths as of %s' % provisional_date_display_name

scatter = CreateScatterPlotAndMap(
    nyt_merged_provisional_df, cdc_nyt_provisional_fields_dict, cdc_nyt_provisional_title, 18000, scatter_height, scatter_width, map_width, 'county', 'ratio'
)

(scatter).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).configure_mark(
    stroke='grey'
).display()

Deaths with Race/Ethnicity

In the charts and maps below, we can see that the CDC Provisional Deaths datasets have a strikingly high percentage of deaths with known race/ethnicity. The state dataset had both counts and percentages of deaths from COVID-19 within each race/ethnicity group but not a total deaths count. There is an "Other" category that includes "More than one race or Unknown," and we treated that entire category as Unknown race/ethnicity. The percentages for all the categories including Other didn't always add up to 100%, so we treated the remaining percentage as Unknowns as well (data for a group can be suppressed if that group had fewer than 10 deaths). The county dataset included only percentages of deaths from COVID-19 within each race/ethnicity group, but it did include a total count per county so that we could calculate counts.

In [ ]:
#@title

crdt_provisional_known_state_fields_dict = {
    'x': {'name': 'crdt_known_race_cases', 'format': ',', 'title': 'Known race/ethnicity deaths'},
    'y': {'name': 'crdt_cases', 'format': ',', 'title': 'CRDT deaths'},
    'percent': {'name': 'crdt_known_race_cases_percent', 'format': '.0%', 'title': 'Percent known deaths'},
}

crdt_provisional_known_state_title = 'CRDT Deaths with Known Race/Ethnicity as of %s' % provisional_date_display_name
crdt_provisional_known_map = CreateMap(
    cdc_provisional_crdt_merged_df, crdt_provisional_known_state_fields_dict, crdt_provisional_known_state_title, total_cases_scale_max, map_height, map_width, 'state', 'percent'
)
cdc_provisional_known_state_fields_dict = {
    'x': {'name': 'cdc_known_cases', 'format': ',', 'title': 'Known race/ethnicity deaths'},
    'y': {'name': 'cdc_cases', 'format': ',', 'title': 'CDC deaths'},
    'percent': {'name': 'cdc_known_cases_percent', 'format': '.0%', 'title': 'Percent known deaths'},
}

cdc_provisional_known_state_title = 'CDC Provisional Deaths with Known Race/Ethnicity as of %s' % provisional_date_display_name
cdc_provisional_known_state_map = CreateMap(
    cdc_provisional_crdt_merged_df, cdc_provisional_known_state_fields_dict, cdc_provisional_known_state_title, total_cases_scale_max, map_height, map_width, 'state', 'percent'
)

(crdt_provisional_known_map | cdc_provisional_known_state_map).configure(
    padding={"left": 0, "top": 5, "right": 0, "bottom": 5}
).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).display()
In [ ]:
#@title
fields_dict = {
    'x': {'name': 'crdt_known_race_cases', 'format': ',', 'title': 'CRDT known race/ethnicity deaths'},
    'y': {'name': 'cdc_known_cases', 'format': ',', 'title': 'CDC known race/ethnicity deaths'},
    'percent': {'name': 'percent_known_cases', 'format': '.2f', 'title': 'Ratio of CDC to CRDT'},
}
title = 'Ratio of CDC Provisional to CRDT Deaths with Known Race/Ethnicity'

CreateScatterPlotAndMap(
    cdc_provisional_crdt_merged_df, fields_dict, title, 40000, scatter_height, scatter_width, map_width - 5, 'state', 'ratio'
).display()
In [ ]:
#@title
cdc_known_county_provisional_fields_dict = {
    'x': {'name': 'total_known_cases', 'format': ',', 'title': 'Known race/ethnicity deaths'},
    'y': {'name': 'COVID_19_Deaths', 'format': ',', 'title': 'CDC deaths'},
    'percent': {'name': 'percent_known_cases', 'format': '.0%', 'title': 'Percent known deaths'},
}
cdc_known_county_provisional_title = 'CDC Provisional Deaths with Known Race/Ethnicity as of %s' % provisional_date_display_name
cdc_known_county_provisional_map = CreateMap(
    nyt_merged_provisional_df, cdc_known_county_provisional_fields_dict, cdc_known_county_provisional_title, total_cases_scale_max, map_height, map_width, 'county', 'percent'
)

(cdc_known_county_provisional_map).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).display()

What State and County Data are Usable?

How can states and counties improve their completeness for race/ethnicity data, especially when compared to the more reliable and up-to-date aggregate data that come from public health websites, as collected by the CRDT and NYT?

There are two ways in which states can improve the data they send to the CDC:

  1. Increase the total deaths reported to get closer to the aggregate data.
  2. Increase the percentage of deaths reported with known race/ethnicity to get closer to 100%.

In the Total Death Counts section above, we identified the states and counties with the biggest discrepancies relative to aggregate data. In the Deaths with Race/Ethnicity section, we looked at the percentage of deaths within each state and county that have race/ethnicity data.

The charts below show those two components together; the scatterplots show (1) the CDC death counts as a percentage of the CRDT/NYT total death counts on the y-axis, and (2) the percentage of CDC deaths with known race/ethnicity on the x-axis. The colors of the dots and on the map show the product of those two numbers, which is the percentage of expected total deaths that have race/ethnicity in the CDC dataset.

The scatterplots below can help us diagnose the issues in each state or county:

  • Bottom left quadrant: Low percentage of deaths reported, low reporting of race/ethnicity (and/or high suppression).
  • Top left quadrant: Mid-to-high percentage of deaths reported, low reporting of race/ethnicity (and/or high suppression).
  • Bottom right quadrant: Low percentage of deaths reported, mid-to-high reporting of race/ethnicity (and/or high suppression).
  • Top right quadrant: Mid-to-high percentage of deaths reported, mid-to-high reporting of race/ethnicity (and/or high suppression).
In [ ]:
#@title
nyt_cdc_known_merged_df = chart_df.join(nyt_counties_df, on="county_fips", how='inner', lsuffix='_left', rsuffix='_right')
nyt_cdc_known_merged_df.reset_index(inplace=True)
nyt_cdc_known_merged_df['percent'] = round(nyt_cdc_known_merged_df.total_cases / nyt_cdc_known_merged_df.nyt_cases, 2)
In [ ]:
#@title
crdt_merged_df['percent_max_100'] = crdt_merged_df.percent.clip(upper=1)
crdt_merged_df['percent_reccs'] = crdt_merged_df.percent_max_100 * crdt_merged_df.cdc_known_cases_percent
state_reccs_fields_dict = {
    'y': {'name': 'percent_max_100', 'format': '.0%', 'title': 'CDC percent of CRDT total deaths'},
    'x': {'name': 'cdc_known_cases_percent', 'format': '.0%', 'title': 'CDC percent with race/ethnicity'},
    'percent': {'name': 'percent_reccs', 'format': '.0%', 'title': 'Product: CDC percent of CRDT total with race/ethnicity'},
}
state_reccs_title = 'CDC Percent of Total Deaths x Race/Ethnicity as of %s' % date_display_name

scatter = CreateScatterPlotAndMap(
    crdt_merged_df, state_reccs_fields_dict, state_reccs_title, 1, scatter_height, scatter_width, map_width, 'state', 'percent'
)
scatter.configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).configure_mark(
    stroke='grey'
).display()
In [ ]:
#@title
nyt_cdc_known_merged_df['percent_max_100'] = nyt_cdc_known_merged_df.percent.clip(upper=1)
nyt_cdc_known_merged_df['percent_reccs'] = nyt_cdc_known_merged_df.percent_max_100 * nyt_cdc_known_merged_df.percent_known_cases
county_reccs_fields_dict = {
    'y': {'name': 'percent_max_100', 'format': '.0%', 'title': 'CDC percent of NYT total deaths'},
    'x': {'name': 'percent_known_cases', 'format': '.0%', 'title': 'CDC percent with race/ethnicity'},
    'percent': {'name': 'percent_reccs', 'format': '.0%', 'title': 'Product: CDC percent of NYT total with race/ethnicity'},
}
county_reccs_title = state_reccs_title = 'CDC Percent of Total Deaths x Race/Ethnicity as of %s' % date_display_name


scatter = CreateScatterPlotAndMap(
    nyt_cdc_known_merged_df, county_reccs_fields_dict, county_reccs_title, 1, scatter_height, scatter_width, map_width, 'county', 'percent'
)
scatter.configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).configure_mark(
    stroke='grey'
).display()

Notes:

  • All states or counties with > 100% of the total deaths in the CRDT or NYT data were capped at 100%.
  • A larger version of the county map for hovering over smaller counties is available in the Appendix.

In the CDC Provisional Deaths datasets, almost all states and counties are in the top right quadrant, which means that they have a mid-high percentage of total deaths and a mid-high percentage of deaths with race/ethnicity.

In [ ]:
#@title
cdc_provisional_crdt_merged_df['percent_max_100'] = cdc_provisional_crdt_merged_df.percent.clip(upper=1)
cdc_provisional_crdt_merged_df['percent_reccs'] = cdc_provisional_crdt_merged_df.percent_max_100 * cdc_provisional_crdt_merged_df.cdc_known_cases_percent
provisional_state_reccs_fields_dict = {
    'y': {'name': 'percent_max_100', 'format': '.0%', 'title': 'CDC percent of CRDT total deaths'},
    'x': {'name': 'cdc_known_cases_percent', 'format': '.0%', 'title': 'CDC percent with known or suppressed race/ethnicity'},
    'percent': {'name': 'percent_reccs', 'format': '.0%', 'title': 'Product: CDC percent of CRDT total with race/ethnicity'},
}
provisional_state_reccs_title = 'CDC Provisional Percent of Total Deaths x Race/Ethnicity as of %s' % provisional_date_display_name

scatter = CreateScatterPlotAndMap(
    cdc_provisional_crdt_merged_df, provisional_state_reccs_fields_dict, provisional_state_reccs_title, 1, scatter_height, scatter_width, map_width, 'state', 'percent'
)
scatter.configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).configure_mark(
    stroke='grey'
).display()
In [ ]:
#@title
nyt_merged_provisional_df['percent_max_100'] = nyt_merged_provisional_df.percent.clip(upper=1)
nyt_merged_provisional_df['percent_reccs'] = nyt_merged_provisional_df.percent_max_100 * nyt_merged_provisional_df.percent_known_cases
county_reccs_provisional_fields_dict = {
    'y': {'name': 'percent_max_100', 'format': '.0%', 'title': 'CDC percent of NYT total deaths'},
    'x': {'name': 'percent_known_cases', 'format': '.0%', 'title': 'CDC percent with race/ethnicity'},
    'percent': {'name': 'percent_reccs', 'format': '.0%', 'title': 'Product: CDC percent of NYT total with race/ethnicity'},
}
county_reccs_provisional_title = state_reccs_title = 'CDC Provisional Percent of Total Deaths x Race/Ethnicity as of %s' % provisional_date_display_name

scatter_provisional = CreateScatterPlotAndMap(
    nyt_merged_provisional_df, county_reccs_provisional_fields_dict, county_reccs_provisional_title, 1, scatter_height, scatter_width, map_width, 'county', 'percent'
)
scatter_provisional.configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).configure_mark(
    stroke='grey'
).display()

Notes:

  • All states or counties with > 100% of the total deaths in the CRDT or NYT data were capped at 100%.
  • A larger version of the county map for hovering over smaller counties is available in the Appendix.

We can get an overall measure of completeness if we look at the number of states in the top right corner of the scatterplot where the composite score is > 85% (where the dots turn dark blue) and > 50% (where the dots turn yellow).

In [ ]:
#@title
#PrintSummaryStats(crdt_merged_df, field='percent_reccs')
#PrintSummaryStats(nyt_cdc_known_merged_df, field='percent_reccs')
#greater_than_85_df = nyt_cdc_known_merged_df[nyt_cdc_known_merged_df['percent_reccs'] > .85]
#print('total pop > 85%: ', greater_than_85_df.total_pop.sum(), greater_than_85_df.total_pop.sum() / 328239523)
#greater_than_50_df = nyt_cdc_known_merged_df[nyt_cdc_known_merged_df['percent_reccs'] > .50]
#print('total pop > 50%: ', greater_than_50_df.total_pop.sum(), greater_than_50_df.total_pop.sum() / 328239523)
In [ ]:
#@title
#PrintSummaryStats(cdc_provisional_crdt_merged_df, field='percent_reccs')
#PrintSummaryStats(nyt_merged_provisional_df, field='percent_reccs')
#greater_than_85_df = nyt_merged_provisional_df[nyt_merged_provisional_df['percent_reccs'] > .85]
#print('total pop > 85%: %d, %f', greater_than_85_df.total_pop.sum(), greater_than_85_df.total_pop.sum() / 328239523)
#greater_than_50_df = nyt_merged_provisional_df[nyt_merged_provisional_df['percent_reccs'] > .50]
#print('total pop > 50%: %d, %f', greater_than_50_df.total_pop.sum(), greater_than_50_df.total_pop.sum() / 328239523)
In [ ]:
#@title
# Manually update these fields based on the latest CDC data.
row_names = [
    'Number of states with composite > 85%',
    '(as a percent of all states)',
    'Number of states with composite > 50%',
    '(as a percent of all states)',
    'Number of counties with composite > 85%',
    '(as a percent of all counties)',
    'Number of counties with composite > 50%',
    '(as a percent of all counties)',
    'Population in counties with composite > 85%',
    '(as a % of total U.S population – States + D.C.)',
    'Population in counties with composite > 50%',
    '(as a % of total U.S population – States + D.C.)',
]
crdt_metadata = [
    '44',
    '(86%)',
    '50',
    '(98%)',
    '-',
    '-',
    '-',
    '-',
    '-',
    '-',
    '-',
    '-',
]
cdc_state_provisional_metadata = [
    '40',
    '(80%)',
    '50',
    '(98%)',
    '-',
    '-',
    '-',
    '-',
    '-',
    '-',
    '-',
    '-',
]
cdc_metadata = [
    '8',
    '(17%)',
    '33',
    '(69%)',
    '588',
    '(19%)',
    '1,530',
    '(49%)',
    '79M',
    '(24%)',
    '188M',
    '(57%)',
]
cdc_county_provisional_metadata = [
    '-',
    '-',
    '-',
    '-',
   '469',
    '(15%)',
    '629',
    '(20%)',
    '183M',
    '(56%)',
    '243M',
    '(74%)',
]
table_data = {'CRDT': crdt_metadata, 'CDC Case Surveillance': cdc_metadata, 'CDC Provisional Deaths State': cdc_state_provisional_metadata, 'CDC Provisional Deaths County': cdc_county_provisional_metadata}
metadata_df = pd.DataFrame(table_data, index=row_names)
metadata_df.head(15)
Out[ ]:
CRDT CDC Case Surveillance CDC Provisional Deaths State CDC Provisional Deaths County
Number of states with composite > 85% 44 8 40 -
(as a percent of all states) (86%) (17%) (80%) -
Number of states with composite > 50% 50 33 50 -
(as a percent of all states) (98%) (69%) (98%) -
Number of counties with composite > 85% - 588 - 469
(as a percent of all counties) - (19%) - (15%)
Number of counties with composite > 50% - 1,530 - 629
(as a percent of all counties) - (49%) - (20%)
Population in counties with composite > 85% - 79M - 183M
(as a % of total U.S population – States + D.C.) - (24%) - (56%)
Population in counties with composite > 50% - 188M - 243M
(as a % of total U.S population – States + D.C.) - (57%) - (74%)

If we require that states or counties have 85% of total expected deaths with race/ethnicity, we can use the CDC Provisional Deaths State dataset for 80% of states vs. 17% from the CDC case dataset. We can use the CDC Provisional Deaths County dataset for 15% of counties vs. 19% from the CDC case dataset, where the Provisional Deaths dataset's counties account for 56% of the U.S. population vs. 24% from the CDC case dataset.

If we loosen that requirement to 50% of total expected deaths with race/ethnicity, we can use the CDC Provisional Deaths State dataset for 98% of states vs. 69% from the CDC case dataset. We can use the CDC Provisional Deaths County dataset for 20% of counties vs. 49% from the CDC case dataset, where the Provisional Deaths dataset's counties account for 74% of the U.S. population vs. 57% from the CDC case dataset.

If you are looking for state- or county-level deaths data with race/ethnicity, the CDC Provisional Deaths datasets are more complete than the CDC Case Surveillance dataset in almost every way: 98%-99% of the deaths reported have race/ethnicity, they generally do not lag far behind state website data as reported in the CRDT/NYT, and they are updated once a week rather than once a month.

However, there are a few tradeoffs with using the Provisional Deaths State dataset:

  • The data are cumulative, so you will not be able to analyze deaths over time unless you collect the weekly updates from the CDC.
  • Age and sex data are available at the state level only in a separate dataset. All the Provisional Deaths datasets are listed at the CDC site.

There are a few additional tradeoffs with using the Provisional Deaths County dataset:

  • Only 20% of counties are included in the county dataset even though those counties account for 78% of the U.S. population. Fewer counties in the Provisional Deaths dataset are above the 85% and 50% composite measure than in the CDC Case data, but those counties account for a larger percentage of the U.S. population.
  • The Provisional Deaths County dataset should not be used at the state or U.S. levels due to the exclusion of counties with fewer than 100 COVID-19 deaths, which are generally counties with small populations. The Provisional Deaths State dataset should be used instead. The county dataset could be used for trends across the U.S. in counties with large populations.

Appendix

Large county maps

To make it easier to hover over small counties, here are larger versions of the county maps that appeared in this report.

In [ ]:
#@title
cdc_nyt_map = CreateMap(
    nyt_merged_df, cdc_nyt_fields_dict, cdc_nyt_title, total_cases_scale_max, map_height * 2, map_width * 2, 'county', 'ratio'
).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
)
cdc_nyt_map.display()
In [ ]:
#@title
cdc_known_county_map = CreateMap(
    chart_df, cdc_known_county_fields_dict, cdc_known_county_title, total_cases_scale_max, map_height * 2, map_width * 2, 'county', 'percent'
).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
)

cdc_known_county_map.display()
In [ ]:
#@title
cdc_known_or_na_county_map = CreateMap(
    chart_df, cdc_known_or_na_county_fields_dict, cdc_known_or_na_county_title, total_cases_scale_max, map_height * 2, map_width * 2, 'county', 'percent'
).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
)
cdc_known_or_na_county_map.display()
In [ ]:
#@title
county_completeness = CreateMap(
    nyt_cdc_known_merged_df, county_reccs_fields_dict, county_reccs_title, 1, map_height * 2, map_width * 2, 'county', 'percent'
)
county_completeness.configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).configure_mark(
    stroke='grey'
).display()
In [ ]:
#@title
cdc_nyt_map = CreateMap(
    nyt_merged_provisional_df, cdc_nyt_provisional_fields_dict, cdc_nyt_provisional_title, total_cases_scale_max, map_height * 2, map_width * 2, 'county', 'ratio'
)

cdc_nyt_map.configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).configure_mark(
    stroke='grey'
).display()
In [ ]:
#@title
cdc_known_county_map = CreateMap(
    nyt_merged_provisional_df, cdc_known_county_provisional_fields_dict, cdc_known_county_provisional_title, total_cases_scale_max, map_height * 2, map_width * 2, 'county', 'percent'
)

cdc_known_county_map.configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).configure_mark(
    stroke='grey'
).display()
In [ ]:
#@title
map = CreateMap(
    nyt_merged_provisional_df, county_reccs_provisional_fields_dict, county_reccs_provisional_title, 1, map_height * 2, map_width * 2, 'county', 'percent'
)

map.configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).configure_mark(
    stroke='grey'
).display()

Geographic Notes

The CDC Provisional Deaths State dataset had separate rows for New York State and New York City; we combined them into New York State.

Data Citations and Disclaimers

  • CDC data full citation: Centers for Disease Control and Prevention, COVID-19 Response. COVID-19 Case Surveillance Data Access, Summary, and Limitations (version date: January 31, 2021).
  • Per the CDC data agreement: The CDC does not take responsibility for the scientific validity or accuracy of methodology, results, statistical analyses, or conclusions presented.
  • CDC Provisional Deaths Data: State, County, and dataset index. National Center for Health Statistics (NCHS), National Vital Statistics System. Estimates are based on provisional data.
  • Population data: U.S. Census Bureau's American Community Survey 2019 5-year estimates accessed via API; e.g., sample query.
  • Covid Racial Data Tracker data: Available in a public spreadsheet.
  • New York Times data: Available as a public CSV file or via Google Cloud Public Datasets.

Contact information

Please email us at shli-covid-data-analysis@googlegroups.com with questions or comments.

In [ ]:
#%%shell
#jupyter nbconvert --to html 'cdc_death_data.ipynb'